library("sleuth")
## Loading required package: ggplot2
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
base_dir <- "../results"
Load subsamples
all_ss_directory <- Sys.glob("../results/ss*")
# sanity check
temp <- sapply(all_ss_directory, function(x) {
strsplit(x, '/')[[1]][3]
}, USE.NAMES = FALSE)
all.equal(temp, all_ss_id$sample)
## [1] "35 string mismatches"
# all_ss_id <- mutate(all_ss_id, path = all_ss_directory)
so <- sleuth_prep(all_ss_directory, all_ss_id, ~1)
## reading in kallisto results
## ........................................
##
## normalizing est_counts
## 81208 targets passed the filter
## normalizing tpm
## merging in metadata
## normalizing bootstrap samples
# so <- sleuth_prep(all_ss_id, ~1)
ss_summary <- so$obs_norm %>%
group_by(target_id) %>%
summarise(
mean_tpm = mean(tpm),
sd_tpm = sd(tpm),
var_tpm = var(tpm),
cv_tpm = sd_tpm / mean_tpm,
mean_est_counts = mean(est_counts),
sd_est_counts = sd(est_counts),
var_est_counts = var(est_counts),
cv_est_counts = sd_est_counts / mean_est_counts
)
ss_summary
## Source: local data frame [198,457 x 9]
##
## target_id mean_tpm sd_tpm var_tpm cv_tpm
## (chr) (dbl) (dbl) (dbl) (dbl)
## 1 ENST00000000233 107.7728992 3.42522541 11.732169140 0.03178188
## 2 ENST00000000412 55.5685781 2.58297827 6.671776760 0.04648271
## 3 ENST00000000442 20.0162404 1.64091801 2.692611903 0.08197933
## 4 ENST00000001008 112.2844862 1.21094225 1.466381144 0.01078459
## 5 ENST00000001146 0.3376546 0.20809803 0.043304792 0.61630450
## 6 ENST00000002125 7.7191243 0.67581750 0.456729298 0.08755106
## 7 ENST00000002165 63.3896571 1.16449783 1.356055197 0.01837047
## 8 ENST00000002501 25.2559906 1.02167625 1.043822358 0.04045283
## 9 ENST00000002596 0.1693633 0.02921479 0.000853504 0.17249778
## 10 ENST00000002829 3.0708259 0.63745593 0.406350064 0.20758452
## .. ... ... ... ... ...
## Variables not shown: mean_est_counts (dbl), sd_est_counts (dbl),
## var_est_counts (dbl), cv_est_counts (dbl)
aggregate by genes
mart <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL",
dataset = "hsapiens_gene_ensembl", host="sep2015.archive.ensembl.org")
## Creating a generic function for 'nchar' from package 'base' in package 'S4Vectors'
t2g <- biomaRt::getBM(
attributes = c("ensembl_transcript_id", "ensembl_gene_id",
"external_gene_name"), mart = mart)
t2g <- dplyr::rename(t2g,
target_id = ensembl_transcript_id, ens_gene = ensembl_gene_id)
gene_subsample_counts <- so$obs_norm %>%
left_join(t2g, by = "target_id") %>%
group_by(ens_gene, sample) %>%
summarize(sum = sum(est_counts)) %>%
summarize(mean_est_counts = mean(sum),
var_est_counts = var(sum),
sd_est_counts = sd(sum))
Load bootstrap
## Found 40 bootstrap samples
collapse_genes <- function(data, mapping) {
data <- left_join(data, mapping, by = 'target_id')
data %>%
group_by(ens_gene) %>%
summarize(
est_counts = sum(est_counts),
tpm = sum(tpm)) %>%
rename(target_id = ens_gene)
}
bs_kal_gene <- bs_kal
bs_kal_gene$bootstrap <- lapply(bs_kal_gene$bootstrap,
function(y) {
collapse_genes(y, t2g)
})
bs_kal_gene$abundance <- collapse_genes(bs_kal_gene$abundance, t2g)
bs_gene_summary <- sleuth:::summarize_bootstrap(bs_kal_gene, "est_counts")
## debugging in: compute_correlation(rsem_df, ss_summary, c(transcript_id = "target_id"))
## debug at <text>#16: {
## tmp <- inner_join(df, the_summary, by = which_id)
## tmp <- filter(tmp, !is.na(count_var) & is.finite(var_est_counts) &
## is.finite(count_var))
## tmp <- mutate(tmp, count_var = count_var + est_counts)
## with(tmp, cor(var_est_counts, count_var))
## }
## debug at <text>#17: tmp <- inner_join(df, the_summary, by = which_id)
## debug at <text>#18: tmp <- filter(tmp, !is.na(count_var) & is.finite(var_est_counts) &
## is.finite(count_var))
## debug at <text>#19: tmp <- mutate(tmp, count_var = count_var + est_counts)
## debug at <text>#21: with(tmp, cor(var_est_counts, count_var))
Subsampled:
## Warning: Removed 9562 rows containing missing values (geom_point).
## Warning: Removed 9562 rows containing missing values (geom_point).
Bootstrapped:
## Warning: Removed 7250 rows containing missing values (geom_point).
## Warning: Removed 7250 rows containing missing values (geom_point).
Subsampled:
## Warning: Removed 2209 rows containing missing values (geom_point).
## Warning: Removed 2209 rows containing missing values (geom_point).
bootstrapped:
## Warning: Removed 711 rows containing missing values (geom_point).
## Warning: Removed 711 rows containing missing values (geom_point).
sessionInfo()
## R version 3.2.3 (2015-12-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.3 LTS
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] utils stats methods base
##
## other attached packages:
## [1] sleuth_0.27.3 dplyr_0.4.3 ggplot2_2.0.0 knitr_1.12.3
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.3 GenomeInfoDb_1.4.3 formatR_1.2.1
## [4] plyr_1.8.3 bitops_1.0-6 tools_3.2.3
## [7] grDevices_3.2.3 zlibbioc_1.14.0 biomaRt_2.24.1
## [10] digest_0.6.9 evaluate_0.8 RSQLite_1.0.0
## [13] gtable_0.1.2 rhdf5_2.12.0 DBI_0.3.1
## [16] yaml_2.1.13 parallel_3.2.3 stringr_1.0.0
## [19] IRanges_2.2.9 S4Vectors_0.6.6 graphics_3.2.3
## [22] datasets_3.2.3 stats4_3.2.3 grid_3.2.3
## [25] data.table_1.9.6 Biobase_2.28.0 R6_2.1.2
## [28] AnnotationDbi_1.30.1 XML_3.98-1.3 rmarkdown_0.9.2
## [31] tidyr_0.4.1 magrittr_1.5 scales_0.3.0
## [34] htmltools_0.3 BiocGenerics_0.14.0 assertthat_0.1
## [37] colorspace_1.2-6 labeling_0.3 stringi_1.0-1
## [40] RCurl_1.95-4.7 lazyeval_0.1.10 munsell_0.4.2
## [43] chron_2.3-47